ScRNA seq data from Calini representing PBMCs from 2 individuals (male age 60,52) in different storing conditions: - fresh - fixed with MetOH - stored in 15% DMSO
Libraries
suppressPackageStartupMessages({
library(pheatmap)
library(scater)
library(dplyr)
library(reshape2)
library(ggplot2)
library(cowplot)
library(mvoutlier)
library(Matrix)
library(purrr)
library(gplots)
library(scran)
library(Seurat)
library(mclust)
library(readxl)
library(DropletUtils)
library(magrittr)
library(LSD)
library(CellMixS)
})
datadir <- "/home/Shared_s3it/_home_Shared_taupo_data_seq/calini_scrnaseq/FGCZ_human"
data_path <- "/home/aluetg/scRNA/pbmc_media2/data"
meta_dir <- "20190312 NovaSeqRun Samples Characteristics.xlsx"
seed <- 1234
Load data and generate a SingleCellExperiment object from them
samples <- list.files(datadir, pattern="CR05", full.names = TRUE)
samples <- samples %>% extract(.!=paste0(datadir, "/CR059"))
samples <- samples %>% extract(.!=paste0(datadir, "/CR052"))
names(samples) <- basename(samples)
samples <- paste0(samples, "/outs/filtered_feature_bc_matrix")
samples <- samples[file.exists(paste0(samples, "/matrix.mtx.gz"))]
sce <- DropletUtils::read10xCounts(samples=samples)
sce$dataset <- basename(gsub("/outs/filtered_feature_bc_matrix","",as.character(sce$Sample)))
meta <- read_excel(paste0(datadir, "/", meta_dir))
sce$Sample <- meta[["Sample Name"]][match(sce$dataset, meta$`Sequencing ID`)]
sce$group <- meta[["Sample Characteristics"]][match(sce$dataset, meta$`Sequencing ID`)]
sce$Sample <- factor(paste(sce$Sample, sce$group))
sce$dataset <- factor(sce$dataset)
colnames(sce) <- paste0(sce$dataset, ".", sce$Barcode)
rownames(sce) <- paste0(rowData(sce)$ID, ".", rowData(sce)$Symbol)
##Add patient id
sce$patient <- ifelse(grepl("D301089", sce$Sample), "pat1", "pat2")
sce$media <- ifelse(grepl("Fresh", sce$Sample), "fresh", ifelse(grepl("DMSO", sce$Sample), "DMSO", "MetOH"))
## Filter out genes that are not expressed in any cell
sce <- sce[which(rowSums(counts(sce) > 0) > 0), ]
dim(sce)
## [1] 23886 40717
## Mitochondrial genes
## Find mitochondrial genes
is.mito <- grepl("\\MT-", rownames(sce))
summary(is.mito)
## Mode FALSE TRUE
## logical 23873 13
rownames(sce)[is.mito]
## [1] "ENSG00000198888.MT-ND1" "ENSG00000198763.MT-ND2"
## [3] "ENSG00000198804.MT-CO1" "ENSG00000198712.MT-CO2"
## [5] "ENSG00000228253.MT-ATP8" "ENSG00000198899.MT-ATP6"
## [7] "ENSG00000198938.MT-CO3" "ENSG00000198840.MT-ND3"
## [9] "ENSG00000212907.MT-ND4L" "ENSG00000198886.MT-ND4"
## [11] "ENSG00000198786.MT-ND5" "ENSG00000198695.MT-ND6"
## [13] "ENSG00000198727.MT-CYB"
sce <- calculateQCMetrics(sce, feature_controls = list(Mt = is.mito))
plotHighestExprs(sce,n=30)
#Plot functions
outlierPlot <- function(cd, feature, aph=NULL, logScale=FALSE, show.legend=TRUE){
if(is.null(aph)) aph <- paste0(feature, "_drop")
if(!(aph %in% colnames(cd))) aph <- NULL
p <- ggplot(as.data.frame(cd), aes_string(x = feature, alpha = aph)) +
geom_histogram(show.legend=show.legend)
if(!is.null(aph)) p <- p + scale_alpha_manual(values = c("TRUE" = 0.4, "FALSE" = 1))
if(logScale) p <- p + scale_x_log10()
p
}
plQCplot <- function(cd, show.legend=TRUE){
ps <- lapply(split(cd,cd$dataset), sl=show.legend, FUN=function(x,sl){
list( outlierPlot( x, "total_counts", logScale=T, show.legend=sl),
outlierPlot( x, "total_features_by_counts", "total_features_drop",
logScale=T, show.legend=sl),
outlierPlot( x, "pct_counts_Mt", "mito_drop", show.legend=sl)
)
})
plot_grid( plotlist = do.call(c, ps),
labels=rep(basename(names(ps)), each=length(ps[[1]])),
ncol=length(ps[[1]]),
label_x=0.5 )
}
#Filtering
sce$total_counts_drop <- isOutlier(sce$total_counts, nmads = 2.5,
type = "both", log = TRUE, batch=sce$dataset)
sce$total_features_drop <- isOutlier(sce$total_features_by_counts, nmads = 2.5,
type = "both", log = TRUE, batch=sce$dataset)
sce$mito_drop <- sce$pct_counts_Mt > 5 &
isOutlier(sce$pct_counts_Mt, nmads = 2.5, type = "higher", batch=sce$dataset)
sce$isOutlier <- sce$total_counts_drop | sce$total_features_drop | sce$mito_drop
#Plot
#pdf(file = "/home/aluetg/scRNA/pbmc_media2/scripts/pbmc_media2_pre_qc_files/figure-html/qc_plots.pdf", width = 15, height = 50, paper = "letter")
p <- plQCplot(colData(sce), show.legend=FALSE)
p
save_plot("/home/aluetg/scRNA/pbmc_media2/data/res/qc_filtering.png", p, ncol = 3, nrow = length(levels(sce$dataset)))
#dev.off()
plotColData(sce, x = "total_features_by_counts", y = "pct_counts_Mt",colour = "dataset")
plotColData(sce, x = "total_features_by_counts", y = "pct_counts_Mt",colour = "media")
plotColData(sce, x = "total_counts", y = "pct_counts_Mt",colour = "media")
plotColData(sce, x = "total_counts", y = "pct_counts_Mt",colour = "dataset")
mets <- c("total_counts_drop","total_features_drop","mito_drop")
sapply(mets, FUN=function(x){ sapply(mets, y=x, function(x,y){ sum(sce[[x]] & sce[[y]]) }) })
## total_counts_drop total_features_drop mito_drop
## total_counts_drop 4809 4100 904
## total_features_drop 4100 6137 813
## mito_drop 904 813 2705
nbcells <- cbind(table(sce$Sample),table(sce$Sample[!sce$isOutlier]))
colnames(nbcells) <- c("cells total","cells after filtering")
nbcells
## cells total
## Fresh sample D301089 M age 52 (1967) Purified PBMC 8197
## Fresh sample D301093 M age 60 (1959) Purified PBMC 4778
## Sample D301089 M age 52 (1967) fixed with MetOH Purified PBMC 8807
## Sample D301089 M age 52 (1967) stored with 15% DMSO Purified PBMC 7700
## Sample D301093 M age 60 (1959) fixed with MetOH Purified PBMC 3277
## Sample D301093 M age 60 (1959) stored with 15% DMSO Purified PBMC 7958
## cells after filtering
## Fresh sample D301089 M age 52 (1967) Purified PBMC 6169
## Fresh sample D301093 M age 60 (1959) Purified PBMC 3265
## Sample D301089 M age 52 (1967) fixed with MetOH Purified PBMC 8170
## Sample D301089 M age 52 (1967) stored with 15% DMSO Purified PBMC 6264
## Sample D301093 M age 60 (1959) fixed with MetOH Purified PBMC 2406
## Sample D301093 M age 60 (1959) stored with 15% DMSO Purified PBMC 5865
layout(matrix(1:2,nrow=1))
LSD::heatscatter( sce$total_counts, sce$total_features_by_counts, xlab="Total counts", ylab="Non-zero features", main="",log="xy")
w <- which(!sce$isOutlier)
LSD::heatscatter( sce$total_counts[w], sce$total_features_by_counts[w], xlab="Total counts", ylab="Non-zero features", main="Filtered cells",log="xy")
sce <- sce[,!sce$isOutlier]
sce <- sce[which(rowSums(counts(sce)>1)>=20),]
dim(sce)
## [1] 8331 32139
table(sce$dataset)
##
## CR053 CR054 CR055 CR056 CR057 CR058
## 6169 3265 6264 5865 8170 2406
#Normalize all datasets together
sce <- computeSumFactors(sce)
## Warning in FUN(...): encountered negative size factor estimates
## Warning in FUN(...): encountered negative size factor estimates
## Warning in FUN(...): encountered negative size factor estimates
## Warning in FUN(...): encountered negative size factor estimates
sce <- normalize(sce)
fit_sce <- trendVar(sce, use.spikes=FALSE)
dec_sce <- decomposeVar(sce, fit_sce)
dec_sce$Symbol <- rowData(sce)$Symbol
dec_sce <- dec_sce[order(dec_sce$bio, decreasing = TRUE), ]
hvg <- rownames(dec_sce)[dec_sce$bio > 0]
metadata(sce)$hvg_genes <- hvg
sce <- runPCA(sce, ncomponents = 20, feature_set = hvg)
sce <- runUMAP(sce, feature_set = hvg)
sce <- runTSNE(sce, feature_set = hvg)
visGroup(sce, "dataset")
visGroup(sce, "media")
visGroup(sce, "patient")
visGroup(sce, "dataset", dim_red = "UMAP")
visGroup(sce, "patient", dim_red = "UMAP")
visGroup(sce, "media", dim_red = "UMAP")
visGroup(sce, "dataset", dim_red = "PCA")
seurat <- CreateSeuratObject(counts = counts(sce), meta.data = as.data.frame(colData(sce)), min.cells = 0, min.features = 0, project = "10X", names.delim = ".")
seurat <- NormalizeData(object = seurat)
seurat <- FindVariableFeatures(object = seurat)
seurat <- ScaleData(object = seurat, verbose = FALSE)
hvg_seurat <- seurat@assays$RNA@var.features
hvg_overlap <- intersect(hvg_seurat, hvg)
length(hvg_overlap)
## [1] 810
length(hvg_seurat)
## [1] 2000
length(hvg)
## [1] 3636
head(hvg_overlap)
## [1] "ENSG00000244734.HBB" "ENSG00000188536.HBA2"
## [3] "ENSG00000206172.HBA1" "ENSG00000163220.S100A9"
## [5] "ENSG00000118785.SPP1" "ENSG00000275302.CCL4"
seurat <- RunPCA(object = seurat, npcs = 20, verbose = FALSE)
seurat <- FindNeighbors(object = seurat, reduction = "pca", dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN
seurat <- FindClusters(object = seurat, resolution = 0.2, random.seed = seed)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 32139
## Number of edges: 1165442
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9617
## Number of communities: 16
## Elapsed time: 10 seconds
seurat <- RunTSNE(object = seurat, perplexity = 30, reduction = "pca", dims = 1:20)
seurat <- RunUMAP(object = seurat, reduction = "pca", dims = 1:10, n.neighbors = 30, min.dist = 0.5)
plot_grid(
DimPlot(seurat, reduction = "tsne", group.by="dataset", label = T, label.size = 4, do.return = T),
DimPlot(seurat, reduction = "tsne", group.by="RNA_snn_res.0.2", do.return = T),
DimPlot(seurat, reduction = "umap", group.by="dataset", do.return = T),
DimPlot(seurat, reduction = "umap", group.by="RNA_snn_res.0.2", label = T, label.size = 4, do.return = T)
)